# This script tabulates the 10 highest-scoring synthetic articles.
# It produces a LaTeX file with excerpts from those articles for Appendix Section N.
import pandas as pd
import re

# Locations of the scored-article CSV (input) and the LaTeX listing (output)
input_csv = "data/synthetic/endat_gpt-4o_scored.csv"
output_tex = "data/synthetic/endat_gpt-4o_scored_top10.tex"

# Load the scored articles into a DataFrame
df = pd.read_csv(input_csv)

# Fail fast unless both the article text and its score column are present
required_columns = ("Article", "val")
if not all(column in df.columns for column in required_columns):
    raise ValueError("Required columns 'Article' or 'val' not found in the dataset.")

# Order rows by the cosine similarity score in "val" (highest first),
# then keep the ten best
ranked = df.sort_values(by="val", ascending=False)
df_top10 = ranked.head(10)

def extract_context(text, keyword: str = "POLITFIG"):
    """Return the first sentence mentioning *keyword* with its neighbours.

    The text is split into sentences at whitespace that follows ``.``, ``!``
    or ``?``. The first sentence containing ``keyword`` (case-insensitive) is
    returned together with the sentence before and the sentence after it,
    when they exist, joined by single spaces.

    Args:
        text: Article text to search. Non-string values (e.g. ``None`` or the
            float ``NaN`` that pandas produces for empty CSV cells) are
            treated as having no match instead of raising ``TypeError``.
        keyword: Substring to look for; matching ignores case.

    Returns:
        The context string with surrounding whitespace stripped, or ``None``
        if ``text`` is not a string or no sentence contains ``keyword``.
    """
    # Missing cells arrive as float NaN; re.split would raise TypeError on them.
    if not isinstance(text, str):
        return None

    # Split after sentence-ending punctuation, consuming the whitespace run.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Lower-case the keyword once rather than once per sentence.
    needle = keyword.lower()
    target_idx = next(
        (i for i, sentence in enumerate(sentences) if needle in sentence.lower()),
        None,
    )
    if target_idx is None:
        return None  # No sentence mentions the keyword

    # Slicing clamps at both ends, so a missing neighbour is simply absent.
    window = sentences[max(target_idx - 1, 0):target_idx + 2]
    return " ".join(part for part in window if part).strip()

# Emit one LaTeX entry per top-10 article that mentions the keyword.
# Each entry is a red "doc_id: score" header followed by an lstlisting
# environment holding the extracted context.
with open(output_tex, "w", encoding="utf-8") as tex_file:
    separator = ""  # switches to a blank line once the first entry is written

    for label, record in df_top10.iterrows():
        article_text = record["Article"]
        similarity = record["val"]

        # Fall back to an index-based id when the row has no "doc_id";
        # underscores are escaped so LaTeX does not read them as subscripts.
        fallback_id = f"Article_{label+1}"
        escaped_id = str(record.get("doc_id", fallback_id)).replace("_", r"\_")

        excerpt = extract_context(article_text, keyword="POLITFIG")
        if excerpt is not None:
            # Entries are separated by a blank line; nothing precedes the first.
            tex_file.write(separator)
            separator = "\n\n"

            header = f"\\textcolor{{red}}{{{escaped_id}: {similarity:.4f}}} "
            listing = "\\begin{lstlisting}\n" + excerpt + "\n" + "\\end{lstlisting}"
            tex_file.write(header + listing)
        # Articles without a keyword match are skipped entirely.

print(f"LaTeX file generated: {output_tex}")
